Only frey, mcclain, cindex, sihouette and dunn can be computed. To compute the other indices, data matrix is needed
Dendrogram
Branches are colored according to the groups obtained by maximizing the silhouette width.
Tables Centroids & Mediods
Sites ordered according to their clusters delineated by hierarchical cluster analysis (Wards method) and according to their squared sum of differences to the centroid.
Sites closest to the centroid and mediods for each cluster.
Clusters & z-standardized values of environmental parameters
Numbers indicate each delineated cluster. Values for each variable are z-standardised (Stand_value), i.e. they can be interpreted in terms of differences in standard deviations from the mean. Colored sites are those sites that have the overall smallest sum of squared differences to the centroid.
Source Code
---title: Sample site selection author: Stefan Kunzdate: last-modifiedformat: html: self-contained: true number-sections: false number-depth: 3 anchor-sections: true code-tools: true code-fold: false code-link: false code-block-bg: "#f1f3f5" code-block-border-left: "#31BAE9" mainfont: Source Sans Pro theme: journal toc: true toc-depth: 3 toc-location: left captions: true cap-location: margin table-captions: true tbl-cap-location: margin reference-location: margincomments: hypothesis: trueexecute: warning: false message: false echo : falseeditor: visualeditor_options: chunk_output_type: console---```{r}# TODO: # - Tweak optimal number of groups, use a different approach?# - season?# - Plot for the categorical variables?source("/home/kunzst/Dokumente/Side_projects/Allan_sampling_sites/R/Set_up.R",local = knitr::knit_global())``````{r}# load dataraw_data <-read.csv2(file.path(data_in, "Raw_data_field_2022_Allan_Narjes.csv"),dec =".",sep =",",na.strings =c(" ",""))#str(raw_data)col_remain <-names(raw_data)[!names(raw_data) %in%c("coordinate", "Land.scape", "Date", "Uhrzeit")]raw_data <- raw_data[, col_remain]# Some columns have "," as dec instead of "."# + Some have NWG < ...raw_data <-apply(raw_data, 2, function(y)sub("\\,", ".", y))# Replace NWG values with half of the level of determinationraw_data <-apply(raw_data, 2, function(y)sub("< NWG.+", "0", y))raw_data <-as.data.frame(raw_data)# Create id# In columns "Ort" and "number.of.sample" is sometimes whitespace at the end or beginningraw_data$Ort <-stri_trim(raw_data$Ort, side ="right")raw_data$number.of.sample <-stri_trim(raw_data$number.of.sample, side ="left")raw_data$Site <-paste0(raw_data$Ort,"_",raw_data$number.of.sample)rownames(raw_data) <- raw_data$Site# simplify col namesnames(raw_data) <-sub("\\..mg\\.L\\.", "", names(raw_data))names(raw_data) <-sub("\\.$|\\..$", "", names(raw_data))# Convert character columns to numeric# names(Filter(is.character, raw_data))col_convert <-c("Temp","pH","EC","Alcalinity","Fluorid","Chlorid","Nitrit","Nitrat","Sulfat","Phosphat","TC","IC","TOC","Natrium","Ammonium","Kalium","Calcium","Magnesium","SUVA250..aromaticity","SR..fulvic.acid.to.humic.acid","E2.E3..inverse.molecular.size","Tryptophan.like","Tyrosin.like","BIX..biological","HIX..aromaticity","FI...1.4..plant..soil")raw_data[, col_convert] <-sapply(raw_data[, col_convert], as.numeric)# Remove highly correlated variables# combine Na and Clraw_data$Na_Cl <- raw_data$Chlorid + raw_data$Natriumraw_data[, c("Chlorid", "Natrium")] <-NULL# Further delete: "Fluorid", "Nitrit", "Phosphat", "TC", "IC" und "Ammonium"raw_data[, c("Fluorid", "Nitrit", "Phosphat","TC", "IC")] <-NULL# Env. variablesenv_vars <-names(Filter(is.numeric, raw_data))raw_data_subset <- raw_data[, env_vars]# Filter out sites with missing values# 58 sites remainraw_data_subset <-na.omit(raw_data_subset[, env_vars])# str(raw_data_subset)``````{r}# Cluster analysis ----# Distance matrix# Vegan way, use z-standardised variablesdist_mat <-vegdist(scale(raw_data_subset), method ="euclidean")hc <-hclust(dist_mat, method ="ward.D2")# hc$order# Ade4 way:# Create datasets for individual data types# ade4 approach for mixed variables# d_Num <- Filter(is.numeric, raw_data)# d_Dich <- Filter(is.integer, raw_data)# ktab <- ktab.list.df(list(d_Num, d_Dich))# dist_mat <- dist.ktab(ktab, type = c("Q", "D"))## Calculate optimal number of groups ----# GAP Statisticgap <-clusGap(x =as.matrix(dist_mat),FUN = mycluster_hc,K.max =20,B =500)optimal_nog_gap <-maxSE(gap$Tab[, "gap"], gap$Tab[, "SE.sim"],method ="Tibs2001SEmax")# plot(gap)# Silhouette width# Suggests 19 clusters# Optimal number of groups on the basis of maximizing the dissimilarity between groups and minimizing the dissimilarity within groups.# TODO: Order of NbClust does not match with the dendrogram (but group assignment is right),# e.g. Rose lake is 19, and Emosson lake is 18, which would suggest that both are next to # each other in the dendrogramsilhouette <-NbClust(diss = dist_mat,distance =NULL,min.nc =2,max.nc =20,method ="ward.D",index ="silhouette")optimal_nog_sh <- silhouette$Best.nc[["Number_clusters"]]```### DendrogramBranches are colored according to the groups obtained by maximizing the silhouette width.```{r}#| fig-width: 20#| fig-height: 30#| column: screen-inset# Dendrogram plot, 19 groupspl_dend <- hc %>%as.dendrogram() %>%color_branches(k = optimal_nog_sh) %>%set("labels_cex", 0.7)pl_dend_gg <-as.ggdend(pl_dend)pl_dend_gg <-ggplot(pl_dend_gg, horiz =TRUE, theme =NULL) +theme_classic() +theme(axis.title =element_text(size =22),axis.text.x =element_text(family ="Roboto Mono", size =16),axis.text.y =element_text(family ="Roboto Mono",size =16) )pl_dend_gg``````{r}# Add groups to dataset# (according to the order in the dendrogram/hc)groups <-data.table(Site =names(cutree( hc,k = optimal_nog_sh,order_clusters_as_data =FALSE )),Group =cutree( hc,k = optimal_nog_sh,order_clusters_as_data =FALSE ))# reactable(# groups[order(Group), ],# filterable = TRUE,# highlight = TRUE,# defaultPageSize = 19# )``````{r}## "Centers" of each group ----# Then find those sites that are closest to the center# Or better make a ranking of the closeness to the centersetDT(raw_data)raw_data[groups, Group := i.Group, on ="Site"]# Rm 12 rows where env. variables have not been measuredraw_data <- raw_data[!is.na(Group), ]# Z-Standardize values for numeric variables# Might be a way to do this with scaleraw_data[, (env_vars) :=lapply(.SD, function(y) (y -mean(y)) /sd(y)), .SDcols = env_vars]# LF for calculating centroidraw_data_lf <-melt( raw_data,id.vars =c("Region", "number.of.sample", "Ort", "Remark", "Site", "Group"),variable.name ="Variable",value.name ="Value")# Calculate centroid & closest sites in terms of difference to the centroid?# Could use the SSEraw_data_lf[, Centroid :=mean(Value), by =c("Group", "Variable")]raw_data_lf[, Diff := (Value - Centroid)^2]raw_data_lf[, Sum_diff :=sum(Diff), by =c("Group", "Site")] # Calculate medoids# Here, order of the data is needed, not the dendrogram order (otherwise mediod is# not calculated per group)ind_medoids <-medoids(dist_mat, cutree(hc, k = optimal_nog_sh))# raw_data[ind_medoids, .(Site, Group)]```### Tables Centroids & MediodsSites ordered according to their clusters delineated by hierarchical cluster analysis (Wards method) and according to their squared sum of differences to the centroid.```{r}centroid_results <-unique(raw_data_lf[order(Group), .(Site, Group, Sum_diff)])reactable( centroid_results[order(Group, Sum_diff), .(Site, Group,Sum_diff =round(Sum_diff, digits =2))],columns =list(Sum_diff =colDef(name ="Sum Difference (p. site and group)")),filterable =TRUE,highlight =TRUE,defaultPageSize =29)```Sites closest to the centroid and mediods for each cluster.```{r}centroid_subset <- centroid_results[order(Group, Sum_diff), .SD[1], .SDcols ="Site", by = Group]setnames(centroid_subset,old ="Site",new ="Centroid")mediod_subset <- raw_data[ind_medoids, .(Mediod = Site, Group)]# Groups are not ordered in mediod subset. Hence, merge with centroid_subsetcentroid_subset[mediod_subset, Mediod := i.Mediod, on ="Group"]reactable( centroid_subset,highligh =TRUE,filterable =TRUE,defaultPageSize =20)```### Clusters & z-standardized values of environmental parametersNumbers indicate each delineated cluster. Values for each variable are z-standardised (Stand_value), i.e. they can be interpreted in terms of differences in standard deviations from the mean. Colored sites are those sites that have the overall smallest sum of squared differences to the centroid.```{r}# Dataset with summed differences to the cluster centroids/meansdata_summed_diff <-unique(raw_data_lf[order(Group, Sum_diff), .(Site, Group, Sum_diff)])representative_sites <- data_summed_diff[, .SD[1], by = Group][, Site]# Plotraw_data_lf[, Group :=factor(Group, levels =1:20)]raw_data_lf[, Site :=factor(Site, levels =c( representative_sites,setdiff(unique(raw_data_lf$Site), representative_sites)))]pl_sites <-ggplot(raw_data_lf) +geom_point(aes(x = Variable,y = Value,key = Site),size =2.5) +geom_point(data =~ .x[Site %in% representative_sites, ],aes(x = Variable,y = Value,color = Site),size =2.5) +geom_hline(aes(yintercept =0), linetype ="dotted") +facet_wrap(~ Group) +#labs(x = "Group", y = "Z-stand value") +coord_flip() +theme_bw() +theme(axis.title =element_text(size =16),axis.text.x =element_text(family ="Roboto Mono",size =12),axis.text.y =element_text(family ="Roboto Mono",size =12),legend.title =element_text(family ="Roboto Mono",size =16),legend.text =element_text(family ="Roboto Mono",size =14),strip.text =element_text(size =12) )``````{r}#| fig-width: 30#| fig-height: 50#| column: screen-insetggplotly(pl_sites, tooltip ="Site")``````{r}##### Complete overview results# Complete table of environmental variables and cluster analysis results. Values are rounded for better display. The Centroid is the mean for each variable and has been calculated on z-standardised values (i.e., its value is zero). Sum Difference is the squared sum of differences to the centroid for each site.#| column: screen-inset# reactable(# test_dat_lf[order(Group, Variable),# .(Site, # Season, # Group, # Variable, # Value, # Stand_value = round(Stand_value, digits = 2), # Centroid = round(Centroid, digits = 2), # Sum_diff = round(Sum_diff, digits = 2))],# columns = list(# Site = colDef(width = 120),# Season = colDef(width = 120),# Group = colDef(width = 120), # # Variable = colDef(width = 180),# # Value = colDef(width = 180),# Stand_value = colDef(name = "Z-Stand. Value"), # width = 160# Centroid = colDef(name = "Centroid (p. group and variable"), # width = 160# Sum_diff = colDef(name = "Sum Difference (p. site and group)") # width = 160# ),# filterable = TRUE,# highlight = TRUE,# defaultPageSize = 20# )```